# load the config file
yaml.file <- yaml.load_file('config_ongoing_run.yaml')
meta <- read.table(yaml.file$METAFILE, sep='\t', header=T)
#Argument Parser
RDATA_DMT <- params$param1
CONDI <- params$param2
RDATA_PREP_FIGS <- params$param3
PLOT_RDATA <- params$param4
OUTPUT_PATH <- params$param5
control.cond <- unlist(strsplit(CONDI, "_"))[1]
treat.cond <- unlist(strsplit(CONDI, "_"))[2]
mark <- unlist(strsplit(CONDI, "_"))[3]
LEVEL <- yaml.file$LEVEL
CUSTOM_ANNOT <- yaml.file$CUSTOM_ANNOT
# colors - pinkish
#full.color = "#de0000"
#high.color = "#ff8092"
#mid.color = "#f5e0e8"
#low.color = "#996d9a"
#unmeth.color = "#0a1661"
#hyper.color = "#de0000"
#hypo.color = "#0a1661"
#not.color = "#f5e0e8"
# moins rose
full.color = "#de0000"
high.color = "#e17e65"
mid.color = "#c6c6c6"
low.color = "#7c6fac"
unmeth.color = "#0f208f"
hyper.color = "#de0000"
hypo.color = "#0f208f"
not.color = "#c6c6c6"
# couleurs Elouan
#full.color = "#342A1F",
#high.color = "#C59434",
#mid.color = "#6F7498",
#low.color = "#A3B7F9",
#unmeth.color = "#F4D4D4"
titre <- paste0("Differential Methylation Analysis on ", LEVEL, " (", CONDI, ")")
MKit_differential.Rmd : Developped & Maintained by Olivier Kirsh and Elouan Bethuel.
Perform a differential methylation analysis between pair of conditions.
Launch as Rscript on HPC cluster with the workflow : WGBSflow.
Requires 2 RData environments produced by the following R scripts :
- MKit_diff_bed.R which performs MethylKit DM CpG (DMC) or Tiles (DMT) analysis,
- MKit_diff_fig.R which builds all dataframes (wide and long format) needed to build the figures.
#load packages
library(ggplot2)
library(dbplyr)
library(magrittr)
library(methylKit)
library(yaml)
library(stringr)
load(RDATA_DMT)
load(RDATA_PREP_FIGS)
# extract the information from the yaml file
MINCOV <- yaml.file$MINCOV
MINQUALI <- yaml.file$MINQUALI
UNITE <- yaml.file$UNITE
TILESIZE <- yaml.file$TILESIZE
STEPSIZE <- yaml.file$STEPSIZE
SIGNIDIF <- yaml.file$SIGNIDIF
QVALUE <- yaml.file$QVALUE
DESTRAND <- yaml.file$DESTRAND
merge_annot <- yaml.file$MERGE_WITH_BASICS_ANNOT
Analysis on CpG or Tiles - LEVEL : CpG
Merge reads from both strands? DESTRAND : TRUE
Minimum coverage - MINCOV : 5
Discards the bases that have more than COV.PERCth percentile of coverage - COV.PERC : 99.9
Type of unification (all or at least one per group) - UNITE : all
Threshold for significant differential methylation in % : 20
Qvalue for significant differential methylation: 0.05
head(dm_condi, n = 20)
## [[1]]
## methylRaw object with 68609 rows
## --------------
## chr start end strand coverage numCs numTs
## 1 chr19 3050120 3050120 - 1 1 0
## 2 chr19 3051793 3051793 - 1 1 0
## 3 chr19 3051860 3051860 + 1 0 1
## 4 chr19 3051902 3051902 + 1 1 0
## 5 chr19 3051907 3051907 + 1 1 0
## 6 chr19 3051940 3051940 + 1 0 1
## --------------
## sample.id: WT_1
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[2]]
## methylRaw object with 77141 rows
## --------------
## chr start end strand coverage numCs numTs
## 1 chr19 3051793 3051793 - 1 0 1
## 2 chr19 3051820 3051820 - 1 0 1
## 3 chr19 3051822 3051822 - 1 0 1
## 4 chr19 3051861 3051861 - 1 1 0
## 5 chr19 3051903 3051903 - 2 0 2
## 6 chr19 3051908 3051908 - 2 0 2
## --------------
## sample.id: 1KO_1
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[3]]
## methylRaw object with 76702 rows
## --------------
## chr start end strand coverage numCs numTs
## 1 chr19 3051819 3051819 + 1 0 1
## 2 chr19 3051821 3051821 + 1 1 0
## 3 chr19 3051860 3051860 + 1 0 1
## 4 chr19 3051903 3051903 - 2 0 2
## 5 chr19 3051908 3051908 - 1 0 1
## 6 chr19 3051914 3051914 - 1 0 1
## --------------
## sample.id: WT_2
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[4]]
## methylRaw object with 81732 rows
## --------------
## chr start end strand coverage numCs numTs
## 1 chr19 3050603 3050603 + 1 0 1
## 2 chr19 3051012 3051012 + 1 0 1
## 3 chr19 3051067 3051067 + 1 0 1
## 4 chr19 3051154 3051154 + 1 0 1
## 5 chr19 3051573 3051573 + 1 0 1
## 6 chr19 3051820 3051820 - 1 0 1
## --------------
## sample.id: 1KO_2
## assembly: mm39
## context: CpG
## resolution: base
head(f.dm_condi, n = 20)
## [[1]]
## methylRaw object with 37429 rows
## --------------
## chr start end strand coverage numCs numTs
## 14 chr19 3065878 3065878 - 6 0 6
## 34 chr19 3078956 3078956 - 7 5 2
## 35 chr19 3079413 3079413 + 7 6 1
## 36 chr19 3079414 3079414 - 6 5 1
## 40 chr19 3079526 3079526 - 8 7 1
## 41 chr19 3079664 3079664 - 6 3 3
## --------------
## sample.id: WT_1
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[2]]
## methylRaw object with 38271 rows
## --------------
## chr start end strand coverage numCs numTs
## 38 chr19 3078799 3078799 + 5 3 2
## 44 chr19 3078955 3078955 + 6 1 5
## 53 chr19 3079664 3079664 - 5 0 5
## 65 chr19 3080405 3080405 + 5 0 5
## 67 chr19 3080609 3080609 + 11 1 10
## 71 chr19 3081126 3081126 + 7 1 6
## --------------
## sample.id: 1KO_1
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[3]]
## methylRaw object with 47842 rows
## --------------
## chr start end strand coverage numCs numTs
## 39 chr19 3078603 3078603 + 7 2 5
## 40 chr19 3078604 3078604 - 5 5 0
## 41 chr19 3078649 3078649 + 5 3 2
## 42 chr19 3078650 3078650 - 9 9 0
## 50 chr19 3078956 3078956 - 6 5 1
## 51 chr19 3079413 3079413 + 7 6 1
## --------------
## sample.id: WT_2
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[4]]
## methylRaw object with 49836 rows
## --------------
## chr start end strand coverage numCs numTs
## 45 chr19 3076963 3076963 + 5 0 5
## 56 chr19 3078832 3078832 - 5 2 3
## 59 chr19 3078955 3078955 + 6 1 5
## 70 chr19 3079896 3079896 - 5 0 5
## 72 chr19 3079903 3079903 - 5 1 4
## 73 chr19 3080073 3080073 - 7 1 6
## --------------
## sample.id: 1KO_2
## assembly: mm39
## context: CpG
## resolution: base
head(methst, n = 20)
## Methstatus condi
## 1 High WT
## 2 Mid WT
## 3 Mid WT
## 4 High WT
## 5 Mid WT
## 6 Full WT
## 7 Mid WT
## 8 Mid WT
## 9 Full WT
## 10 Mid WT
## 11 High WT
## 12 Mid WT
## 13 High WT
## 14 Mid WT
## 15 High WT
## 16 High WT
## 17 High WT
## 18 High WT
## 19 High WT
## 20 High WT
head(AllDiffdm, n = 20)
## chr start end strand pvalue qvalue meth.diff coverage1
## 1 chr19 3078955 3078955 + 1.784885e-03 2.424455e-03 -60.25641 13
## 2 chr19 3080405 3080405 + 2.582334e-04 3.905317e-04 -60.00000 15
## 3 chr19 3080609 3080609 + 1.514260e-06 3.908199e-06 -62.17105 19
## 4 chr19 3083013 3083013 + 6.469067e-04 9.249703e-04 -57.77778 15
## 5 chr19 3083062 3083062 + 7.878395e-03 1.009031e-02 -40.90909 12
## 6 chr19 3084251 3084251 + 9.238221e-04 1.295069e-03 -60.00000 10
## 7 chr19 3089018 3089018 + 9.718713e-04 1.357978e-03 -47.55435 16
## 8 chr19 3089088 3089088 + 4.900669e-06 1.083457e-05 -75.00000 12
## 9 chr19 3128947 3128947 + 1.076625e-06 2.916557e-06 -80.95238 11
## 10 chr19 3128966 3128966 + 1.604463e-05 3.095537e-05 -51.23153 29
## 11 chr19 3130088 3130088 + 2.394554e-08 1.239469e-07 -76.20690 20
## 12 chr19 3130109 3130109 + 8.348034e-02 9.968535e-02 -30.00000 10
## 13 chr19 3130123 3130123 + 5.080320e-06 1.117092e-05 -63.07190 18
## 14 chr19 3130130 3130130 + 2.471412e-03 3.302028e-03 -43.49376 17
## 15 chr19 3130134 3130134 + 4.827955e-05 8.395728e-05 -57.93226 17
## 16 chr19 3130191 3130191 + 2.417288e-06 5.821344e-06 -64.00000 25
## 17 chr19 3130195 3130195 + 3.814162e-08 1.804405e-07 -70.00000 25
## 18 chr19 3130209 3130209 + 9.370910e-11 1.682302e-09 -80.95238 21
## 19 chr19 3130335 3130335 + 8.445706e-06 1.749697e-05 -69.87578 23
## 20 chr19 3130430 3130430 + 8.607506e-09 5.477421e-08 -85.71429 28
## numCs1 numTs1 coverage2 numCs2 numTs2 Methyl.Ctrl Methyl.Cond Methstatus_Ct
## 1 10 3 12 2 10 0.7692308 0.16666667 High
## 2 9 6 11 0 11 0.6000000 0.00000000 Mid
## 3 13 6 32 2 30 0.6842105 0.06250000 Mid
## 4 12 3 18 4 14 0.8000000 0.22222222 High
## 5 6 6 22 2 20 0.5000000 0.09090909 Mid
## 6 10 0 10 4 6 1.0000000 0.40000000 Full
## 7 9 7 23 2 21 0.5625000 0.08695652 Mid
## 8 9 3 15 0 15 0.7500000 0.00000000 Mid
## 9 11 0 21 4 17 1.0000000 0.19047619 Full
## 10 19 10 35 5 30 0.6551724 0.14285714 Mid
## 11 18 2 29 4 25 0.9000000 0.13793103 High
## 12 5 5 25 5 20 0.5000000 0.20000000 Mid
## 13 14 4 34 5 29 0.7777778 0.14705882 High
## 14 11 6 33 7 26 0.6470588 0.21212121 Mid
## 15 15 2 33 10 23 0.8823529 0.30303030 High
## 16 20 5 25 4 21 0.8000000 0.16000000 High
## 17 20 5 30 3 27 0.8000000 0.10000000 High
## 18 17 4 27 0 27 0.8095238 0.00000000 High
## 19 21 2 14 3 11 0.9130435 0.21428571 High
## 20 26 2 14 1 13 0.9285714 0.07142857 High
## Methstatus_Cd Diff_expr DM_status
## 1 Low Significant Hypo
## 2 Unmeth Significant Hypo
## 3 Low Significant Hypo
## 4 Low Significant Hypo
## 5 Low Significant Hypo
## 6 Mid Significant Hypo
## 7 Low Significant Hypo
## 8 Unmeth Significant Hypo
## 9 Low Significant Hypo
## 10 Low Significant Hypo
## 11 Low Significant Hypo
## 12 Low non-significant None
## 13 Low Significant Hypo
## 14 Low Significant Hypo
## 15 Mid Significant Hypo
## 16 Low Significant Hypo
## 17 Low Significant Hypo
## 18 Unmeth Significant Hypo
## 19 Low Significant Hypo
## 20 Low Significant Hypo
head(dfpc, n = 20)
## chr start end strand Meth_perc Coverage condi Diff_expr
## 1 chr19 3078955 3078955 + 0.7692308 13 WT Significant
## 2 chr19 3080405 3080405 + 0.6000000 15 WT Significant
## 3 chr19 3080609 3080609 + 0.6842105 19 WT Significant
## 4 chr19 3083013 3083013 + 0.8000000 15 WT Significant
## 5 chr19 3083062 3083062 + 0.5000000 12 WT Significant
## 6 chr19 3084251 3084251 + 1.0000000 10 WT Significant
## 7 chr19 3089018 3089018 + 0.5625000 16 WT Significant
## 8 chr19 3089088 3089088 + 0.7500000 12 WT Significant
## 9 chr19 3128947 3128947 + 1.0000000 11 WT Significant
## 10 chr19 3128966 3128966 + 0.6551724 29 WT Significant
## 11 chr19 3130088 3130088 + 0.9000000 20 WT Significant
## 12 chr19 3130109 3130109 + 0.5000000 10 WT non-significant
## 13 chr19 3130123 3130123 + 0.7777778 18 WT Significant
## 14 chr19 3130130 3130130 + 0.6470588 17 WT Significant
## 15 chr19 3130134 3130134 + 0.8823529 17 WT Significant
## 16 chr19 3130191 3130191 + 0.8000000 25 WT Significant
## 17 chr19 3130195 3130195 + 0.8000000 25 WT Significant
## 18 chr19 3130209 3130209 + 0.8095238 21 WT Significant
## 19 chr19 3130335 3130335 + 0.9130435 23 WT Significant
## 20 chr19 3130430 3130430 + 0.9285714 28 WT Significant
## DM_status Methstatus Methstatus_Ct Methstatus_Cd
## 1 Hypo High High Low
## 2 Hypo Mid Mid Unmeth
## 3 Hypo Mid Mid Low
## 4 Hypo High High Low
## 5 Hypo Mid Mid Low
## 6 Hypo Full Full Mid
## 7 Hypo Mid Mid Low
## 8 Hypo Mid Mid Unmeth
## 9 Hypo Full Full Low
## 10 Hypo Mid Mid Low
## 11 Hypo High High Low
## 12 None Mid Mid Low
## 13 Hypo High High Low
## 14 Hypo Mid Mid Low
## 15 Hypo High High Mid
## 16 Hypo High High Low
## 17 Hypo High High Low
## 18 Hypo High High Unmeth
## 19 Hypo High High Low
## 20 Hypo High High Low
head(dfpc_annotated, n = 20)
## GRanges object with 20 ranges and 9 metadata columns:
## seqnames ranges strand | Meth_perc Coverage condi Diff_expr
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <factor> <factor>
## [1] chr19 3078955 + | 0.769231 13 WT Significant
## [2] chr19 3080405 + | 0.600000 15 WT Significant
## [3] chr19 3080609 + | 0.684211 19 WT Significant
## [4] chr19 3083013 + | 0.800000 15 WT Significant
## [5] chr19 3083062 + | 0.500000 12 WT Significant
## ... ... ... ... . ... ... ... ...
## [16] chr19 3130109 + | 0.500000 10 WT non-significant
## [17] chr19 3130123 + | 0.777778 18 WT Significant
## [18] chr19 3130123 + | 0.777778 18 WT Significant
## [19] chr19 3130130 + | 0.647059 17 WT Significant
## [20] chr19 3130130 + | 0.647059 17 WT Significant
## DM_status Methstatus Methstatus_Ct Methstatus_Cd annot
## <factor> <factor> <factor> <factor> <GRanges>
## [1] Hypo High High Low chr19:1-3103070
## [2] Hypo Mid Mid Unmeth chr19:1-3103070
## [3] Hypo Mid Mid Low chr19:1-3103070
## [4] Hypo High High Low chr19:1-3103070
## [5] Hypo Mid Mid Low chr19:1-3103070
## ... ... ... ... ... ...
## [16] None Mid Mid Low chr19:3125886-3195867:-
## [17] Hypo High High Low chr19:3103071-3247732:-
## [18] Hypo High High Low chr19:3125886-3195867:-
## [19] Hypo Mid Mid Low chr19:3103071-3247732:-
## [20] Hypo Mid Mid Low chr19:3125886-3195867:-
## -------
## seqinfo: 26 sequences from mm39 genome
#################################################################
# Methylation & coverage Statistics on raw data (cpg or tiles)
dfpc_dm <- NULL
groupe <- NULL
for(i in 1:length(dm_condi)){
print(paste0(LEVEL, " raw data"))
print("sample ID")
print(dm_condi[[i]]@sample.id)
groupe <- c(groupe, rep(strsplit(dm_condi[[i]]@sample.id, "_")[[1]][1], dim(dm_condi[[i]])[1]) )
dm_condi[[i]] %>% getData() %>%
dplyr::mutate(., Meth_perc = 100 * (numCs/coverage)) %>%
dplyr::mutate(., ID = dm_condi[[i]]@sample.id) -> df_dm
dfpc_dm <- rbind(dfpc_dm, df_dm)
rm(df_dm)
}
dfpc_dm <- cbind(dfpc_dm, groupe)
#Plot of CpG Methylation on raw data (before filtering coverage and unite)
p_dm <- ggplot(dfpc_dm, aes(x=Meth_perc, fill=groupe, color=groupe)) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("% CpG Methylation on raw data ", "(", LEVEL, ")")) +
facet_grid(. ~ ID ) +
labs(x = "% of methylation", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
p_dm
rm(groupe)
cov_dm <- ggplot(dfpc_dm, aes(x=coverage, fill=groupe, color=groupe)) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 200, 10)) +
ggtitle(paste0("Coverage on raw data ", "(", LEVEL, ")")) +
facet_grid(. ~ ID ) +
labs(x = "Coverage", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
cov_dm
rm(dfpc_dm)
#################################################################
# Methylation & coverage Statistics on filtered data
dfpc_fdm <- NULL
groupe <- NULL
for(i in 1:length(f.dm_condi)){
print("f.dm_condi, filtered data")
print("sample ID")
print(f.dm_condi[[i]]@sample.id)
groupe <- c(groupe, rep(strsplit(f.dm_condi[[i]]@sample.id, "_")[[1]][1], dim(f.dm_condi[[i]])[1]) )
f.dm_condi[[i]] %>% getData() %>%
dplyr::mutate(., Meth_perc = 100 * (numCs/coverage)) %>%
dplyr::mutate(., ID = f.dm_condi[[i]]@sample.id) -> df_fdm
dfpc_fdm <- rbind(dfpc_fdm, df_fdm)
rm(df_fdm)
}
dfpc_fdm <- cbind(dfpc_fdm, groupe)
#Plot of CpG Methylation on filtered data
p_fdm <- ggplot(dfpc_fdm, aes(x=Meth_perc, fill=groupe, color=groupe)) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("% CpG Methylation on filtered data ", "(", LEVEL, ")")) +
facet_grid(. ~ ID ) +
labs(x = "% of methylation", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
p_fdm
rm(groupe)
cov_fdm <- ggplot(dfpc_fdm, aes(x=coverage, fill=groupe, color=groupe)) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("Coverage on filtered data ", "(", LEVEL, ")")) +
facet_grid(. ~ ID) +
labs(x = "Coverage", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
cov_fdm
rm(dfpc_fdm)
For each condition, the plot shows the number of CpGs at different methylation status.
The methylation level is categorised into five classes:
- Full : 100% methylated
- High : between 75 and 100% excluded
- Mid : between 50 et 75% exluded
- Low : between 0 exluded and 25% excluded
- Unmeth : unmethylated
#################################################################
#Barplot Methylation Status bpst object
#requires methst object
bpst <- ggplot(data=methst ,
aes( x = condi, fill = factor( Methstatus, rev( levels(Methstatus) ) ) )
) +
geom_bar() +
labs(fill = '') +
scale_fill_manual("Methylation status : ",
values = c( 'Full' = full.color,
'High' = high.color,
'Mid' = mid.color,
'Low' = low.color,
'Unmeth' = unmeth.color)) +
ggtitle(paste0(CONDI, " Methylation status") ) +
theme_bw(base_size = 14) +
theme(axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5))
bpst
These plots give the proportion of differentially methylated tiles or CpG present in the three categories:
- Hyper: 1KO > WT by more than 20 % and qvalue < 0.05
- Hypo: 1KO < WT by more than 20 % and qvalue < 0.05
- None: no significant difference, qvalue > 0.05
The significance threshold of the methylation difference (20 %) is modifiable in the configuration file
as well as the threshold on the qvalue (0.05).
#################################################################
# Piechart Differentially methylated CpG or Tiles distribution
# piedm object on AllDiffdm$DM_status %>% table %>% data.frame
summary(AllDiffdm$DM_status)
## Hyper Hypo None
## 6 12888 2940
piedm <- ggplot(AllDiffdm$DM_status %>% table %>% data.frame,
aes(x="", y=Freq, fill=.)) +
geom_col() +
scale_fill_manual("", values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
coord_polar("y", start=0) +
theme_void() +
ggtitle(paste0(CONDI),
subtitle = paste0(" Differentially methylated ", LEVEL, " distribution (all)")
) +
theme(plot.title = element_text(hjust = 0.5))
piedm
# piedms object on filter(AllDiffdm, DM_status != \"None\") -> cnt
dplyr::filter(AllDiffdm, DM_status != "None") %>% dplyr::select(. , DM_status) %>% table %>% data.frame() -> cnt
piedms <- ggplot( cnt[-3,], aes(x="", y=Freq, fill=.) ) +
geom_col() +
scale_fill_manual("", values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color)
) +
coord_polar("y", start=0) +
theme_void() +
ggtitle(paste0(CONDI),
subtitle = paste0(" Differentially methylated " ,LEVEL, " distribution (Significant)")
) +
theme(plot.title = element_text(hjust = 0.5))
piedms
For each annotation track (promoters, exons …), the plots show the number of differentially methylated CpGs or tiles as a function of the methylation status in both conditions.
#################################################################
# New barplot fraction low mid high in Diff meth on All the Annots
for(i in IGV ){
dfpc_temp <- dfpc_annotated %>% data.frame
dfpc_temp <- dfpc_temp[ which( dfpc_temp$annot.type %in% i ) , ]
exist_annot <- row.names(table(dfpc_temp$annot.type))
if(is.na(match(x = i , exist_annot)) == FALSE){
dfpc_temp$annot.type <- factor(dfpc_temp$annot.type, levels = IGV[i] )
bp2ct <- ggplot( dfpc_temp, aes(x=condi, fill = DM_status)) +
geom_bar( ) +
facet_grid(~Methstatus, drop = FALSE) +
theme_bw(base_size = 14) +
scale_fill_manual("", values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
ggtitle(paste0(label = CONDI, " Differential methylation status"),
subtitle = paste0( "On annotation track ", i,
" and by ", LEVEL, " methylation level in ",
sample.ids[1])
) +
theme(
axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic")
)
print(bp2ct)
}
}
rm(dfpc_temp)
#################################################################
#Histogram of methylation all
histmeth <- ggplot(dfpc %>% data.frame,
aes(x=Meth_perc*100, fill= condi, color= condi)
) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("% of methylation, for all unified ", LEVEL)) +
facet_grid(. ~ condi, scales = "free", space = "free" ) +
labs(x = "% of methylation", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
histmeth
#################################################################
#Histogram of methylation Signif
histmeths <- ggplot(dfpc_Sig %>% data.frame,
aes(x=Meth_perc*100, fill= condi, color= condi)
) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("% of methylation, for differentially methylated ", LEVEL)) +
facet_grid(. ~ condi, scales = "free", space = "free" ) +
labs(x = "% of methylation", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
histmeths
#################################################################
#Histogram of methylation all by group
histmethg <- ggplot(dfpc %>% data.frame,
aes(x=Meth_perc*100, fill= condi, color= condi)
) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("Methylation distribution on all ", LEVEL, ", by group") ) +
xlab("% of methylation") + ylab("Counts") +
theme_bw(base_size = 14) +
theme(legend.title = element_blank(),
plot.title = element_text(hjust = 0.5))
histmethg
#################################################################
#Histogram of methylation Signif by group
histmethsg <- ggplot(dfpc_Sig %>% data.frame,
aes(x=Meth_perc*100, fill= condi, color=condi)
) +
#scale_fill_manual(values=c("darkorange2", "cornflowerblue", "red", "yellow")) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("Methylation distribution on differentially methylated ", LEVEL, ", by group") ) +
xlab("% of methylation") + ylab("Counts") +
theme_bw(base_size = 14) +
theme(legend.title = element_blank(),
plot.title = element_text(hjust = 0.5))
histmethsg
#################################################################
#Violin plot of methylation All
#vp2
vp2 <- ggplot(dfpc, aes(x= condi, y=Meth_perc*100, fill = condi)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.05) +
labs(title = paste0("Methylation distribution on all ", LEVEL) ,
x = "Samples", y = "% of methylation" ) +
theme_bw(base_size = 14) +
theme(axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5),
legend.position = "none")
vp2
#Violin plot of methylation Signif
#vp2s
vps2 <- ggplot(dfpc_Sig, aes(x= condi , y=Meth_perc*100, fill = condi)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.05) +
labs( title = paste0("Methylation distribution on differentially methylated ", LEVEL) ,
x = "Samples", y = "Methylation %" ) +
theme_bw(base_size = 14) +
theme(axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5),
legend.position = "none")
vps2
#################################################################
#Volcano plot
if(min(AllDiffdm$qvalue) == 0){
ymax <- .Machine$double.xmin # if qvalue == 0 , set the ylim max at the smallest representable number in R (2.225074e-308)
}else{
ymax <- min(AllDiffdm$qvalue)
}
vc <- ggplot(data = AllDiffdm,
aes(x = meth.diff, y = -log10(qvalue), col = DM_status)) +
geom_point() +
theme_bw(base_size = 14) +
labs(col = '') +
xlab("Methylation difference (%)") +
xlim(-100, 100) +
ylim(0, -log10(ymax)) +
scale_color_manual(values=c('Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)) +
geom_vline(xintercept = c(-DIF, DIF), linetype="dashed", col="black") +
geom_hline(yintercept = -log10(QV), linetype="dashed", col="black") +
ggtitle(paste0(CONDI, " (", LEVEL, ")")) +
theme(plot.title = element_text(hjust = 0.5))
vc
for(i in IGV ){
Adtdf <- AllDiffdm_annotated %>% data.frame
Adtdf <- Adtdf[ which( Adtdf$annot.type %in% i ) , ]
exist_annot <- row.names(table(Adtdf$annot.type))
if(is.na(match(x = i , exist_annot)) == FALSE){
Adtdf$annot.type <- factor(Adtdf$annot.type, levels = IGV[i] )
vc_temp <- ggplot(data = Adtdf,
aes(x = meth.diff, y = -log10(qvalue), col = DM_status)) +
geom_point() +
theme_bw(base_size = 14) +
labs(col = '') +
xlab("Methylation difference (%)") +
xlim(-100, 100) +
ylim(0, -log10(ymax)) +
scale_color_manual(values=c('Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)) +
geom_vline(xintercept = c(-DIF, DIF), linetype="dashed", col="black") +
geom_hline(yintercept = -log10(QV), linetype="dashed", col="black") +
ggtitle(paste0(CONDI, " (", LEVEL, "), on annotation ", i))
print(vc_temp)
}
}
rm(Adtdf)
rm(exist_annot)
# MH corrected for cases with low number of differences
if (length(AllDiffdm_annotated) > 40){
top_diff <- data.frame(AllDiffdm_annotated[AllDiffdm_annotated$pvalue <= sort(AllDiffdm_annotated$pvalue)[30] , ])
} else {
top_diff <- data.frame(AllDiffdm_annotated[AllDiffdm_annotated$pvalue <= sort(AllDiffdm_annotated$pvalue), ])
}
top_diff <- top_diff[, c(1,2,3,6,20, 28, 30)]
colnames(top_diff) <- c("Chr", "start", "end", "Pval", "Status", "Gene ID", "Annotation")
knitr::kable(top_diff , "pipe")
| Chr | start | end | Pval | Status | Gene ID | Annotation |
|---|---|---|---|---|---|---|
| chr19 | 3331096 | 3331096 | 0 | Hypo | ENSMUSG00000024831.15 | mm39_genes |
| chr19 | 3331096 | 3331096 | 0 | Hypo | ENSMUSG00000024829.13 | mm39_near_tss |
| chr19 | 3331096 | 3331096 | 0 | Hypo | NA | mm39_introns |
| chr19 | 3331096 | 3331096 | 0 | Hypo | NA | mm39_cpg_shores |
| chr19 | 3331096 | 3331096 | 0 | Hypo | ENSMUSG00000024829.13 | mm39_promoters |
| chr19 | 3360265 | 3360265 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 3399011 | 3399011 | 0 | Hypo | ENSMUSG00000024900.6 | mm39_genes |
| chr19 | 3399011 | 3399011 | 0 | Hypo | NA | mm39_introns |
| chr19 | 4077308 | 4077308 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 4522770 | 4522770 | 0 | Hypo | ENSMUSG00000049303.11 | mm39_genes |
| chr19 | 4522770 | 4522770 | 0 | Hypo | NA | mm39_introns |
| chr19 | 4522770 | 4522770 | 0 | Hypo | NA | mm39_cpg_shelves |
| chr19 | 4539000 | 4539000 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 4629628 | 4629628 | 0 | Hypo | ENSMUSG00000024892.18 | mm39_genes |
| chr19 | 4629628 | 4629628 | 0 | Hypo | NA | mm39_introns |
| chr19 | 4630731 | 4630731 | 0 | Hypo | ENSMUSG00000024892.18 | mm39_genes |
| chr19 | 4630731 | 4630731 | 0 | Hypo | NA | mm39_introns |
| chr19 | 5050563 | 5050563 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 5050563 | 5050563 | 0 | Hypo | NA | mm39_cpg_shores |
| chr19 | 5239781 | 5239781 | 0 | Hypo | ENSMUSG00000024855.11 | mm39_genes |
| chr19 | 5239781 | 5239781 | 0 | Hypo | NA | mm39_introns |
| chr19 | 5455076 | 5455076 | 0 | Hypo | ENSMUSG00000086938.10 | mm39_near_tss |
| chr19 | 5455076 | 5455076 | 0 | Hypo | ENSMUSG00000039330.6 | mm39_near_tss |
| chr19 | 5455076 | 5455076 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 5455076 | 5455076 | 0 | Hypo | ENSMUSG00000086938.10 | mm39_promoters |
| chr19 | 5585517 | 5585517 | 0 | Hypo | ENSMUSG00000109695.2 | mm39_genes |
| chr19 | 5585517 | 5585517 | 0 | Hypo | NA | mm39_introns |
| chr19 | 5601392 | 5601392 | 0 | Hypo | ENSMUSG00000024922.3 | mm39_genes |
| chr19 | 5601392 | 5601392 | 0 | Hypo | NA | mm39_introns |
| chr19 | 5665794 | 5665794 | 0 | Hypo | NA | mm39_intergenic |
rm(top_diff)
if(LEVEL == "Tiles"){
df_alldiff <- data.frame(AllDiffdm_annotated)
sort_alldif_annot <- df_alldiff[order(df_alldiff$pvalue, decreasing=FALSE),]
sort_annotated_genes <- sort_alldif_annot[ sort_alldif_annot$annot.type == paste0(ORG, "_genes"),]
background <- sort_annotated_genes[!duplicated(sort_annotated_genes[, "start"]),]
write.csv(background, file= paste0(OUTPUT_PATH , CONDI, "_background_tiles.csv"))
sort_sig_annotated_genes <- sort_annotated_genes[sort_annotated_genes$Diff_expr == "Significant", ]
write.csv(sort_sig_annotated_genes, file= paste0(OUTPUT_PATH , CONDI, "_significant_annot_tiles.csv"))
print(table(sort_sig_annotated_genes$DM_status))
}
Show the differential methylation status :
- Hyper
- Hypo
- None
On basic genomic annotation tracks.
#################################################################
#plot categorical
dm_annotated <- AllDiffdm_annotated %>% data.frame
dm_annotated$annot.type <- factor(dm_annotated$annot.type , levels = IGV )
anno_re <- ggplot(data=dm_annotated, aes(x=annot.type, fill=DM_status)) +
geom_bar(position = "fill") +
scale_fill_manual("",
values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
scale_x_discrete("", labels = gsub("ORG_", "", IGV ) ) +
theme_bw(base_size = 14) +
ggtitle( paste0('DM status on annotation tracks (relative counts)'),
subtitle = paste0(CONDI) ) +
theme(axis.text.x = element_text( size = 12, angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(), legend.title = element_blank()
)
anno_re
anno_abs <- ggplot(data=dm_annotated, aes(x=annot.type, fill=DM_status)) +
geom_bar() +
scale_fill_manual("",
values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
scale_x_discrete("", labels = gsub("ORG_", "", IGV) ) +
theme_bw(base_size = 14) +
ggtitle( paste0('DM status on annotation tracks (absolute counts)'),
subtitle = paste0(CONDI) ) +
theme(axis.text.x = element_text( size = 8, angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(), legend.title = element_blank()
)
anno_abs
if((CUSTOM_ANNOT == TRUE) && (merge_annot == FALSE)){
library(GenomicRanges)
library(annotatr)
df_custom_tracks <- data.frame(list_custom_tracks[[1]])
for(i in 2:length(list_custom_tracks)){
df <- data.frame(list_custom_tracks[[i]])
df_custom_tracks <- rbind(df_custom_tracks, df)
}
GR_custom_tracks <- makeGRangesFromDataFrame(df_custom_tracks, keep.extra.columns = TRUE)
for(i in 1:length(unique(meta_annot$group))){
print(i)
AllDiffdm_annotated <- annotate_regions(regions = AllDiffdm_regions,
annotations = GR_custom_tracks[GR_custom_tracks$group == unique(meta_annot$group)[i] ],
ignore.strand = TRUE,
quiet = TRUE)
dm_annotated <- AllDiffdm_annotated %>% data.frame
dm_annotated$annot.type <- factor(dm_annotated$annot.type )
anno_re_custom <- ggplot(data=dm_annotated, aes(x=annot.type, fill=DM_status)) +
geom_bar(position = "fill") +
scale_fill_manual("",
values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
scale_x_discrete("", labels = gsub("ORG_", "", dm_annotated$annot.type ) ) +
theme_bw(base_size = 14) +
ggtitle( paste0('DM status on annotation tracks (relative counts)'),
subtitle = paste0(CONDI) ) +
theme(axis.text.x = element_text( size = 8, angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(), legend.title = element_blank()
)
print(anno_re_custom)
anno_abs_custom <- ggplot(data=dm_annotated, aes(x=annot.type, fill=DM_status)) +
geom_bar() +
scale_fill_manual("",
values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
scale_x_discrete("", labels = gsub("ORG_", "", dm_annotated$annot.type) ) +
theme_bw(base_size = 14) +
ggtitle( paste0('DM status on annotation tracks (absolute counts)'),
subtitle = paste0(CONDI) ) +
theme(axis.text.x = element_text( size = 8, angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(), legend.title = element_blank()
)
print(anno_abs_custom)
}
rm(df_custom_tracks)
rm(GR_custom_tracks)
rm(dm_annotated)
rm(AllDiffdm_annotated)
}
For each chromosome show the number of DMCs/DMTs hypo or hyper-methylated
The category named “others” correspond to chromsomes with unconventional names (like Unmapped chromsomes for example)
if( dim(dfpc_Sig)[1] > 0){
count_per_chromosome <- table(dfpc_Sig$chr, dfpc_Sig$DM_status)
count_df <- as.data.frame.matrix(count_per_chromosome)
count_df$chr <- rownames(count_df)
# test si absence de hypo ou hyper
if(dim(count_df)[2] == 2){
if(colnames(count_df[1]) == "hypo"){
count_df$Hyper <- rep(0, dim(count_df)[1])
}else{
count_df$Hypo <- rep(0, dim(count_df)[1])
}
}
chr_others <- count_df[1,]
rownames(chr_others) <- "others"
chr_others$Hypo <- chr_others$Hyper <- 1
chr_others$chr <- "others"
delete_row <- 0
for(i in 1:length(count_df$chr)){
if((nchar(count_df$chr[i]) > 5) == TRUE){
chr_others$Hypo <- (chr_others$Hypo + count_df[i,]$Hypo)
chr_others$Hyper <- (chr_others$Hyper + count_df[i,]$Hyper)
delete_row <- c(delete_row, i)
}
}
if(is.na(delete_row[2]) == FALSE){ # test si il n'y a aucune colonne à remove (delete_row = 0), car sinon génère des bugs
count_df <- count_df[-delete_row,]
}
count_df<- rbind(count_df, chr_others)
count_df$chr <- str_sort(count_df$chr, numeric = TRUE)
count_df_long <- tidyr::gather(count_df, key = "status", value = "count", Hypo, Hyper)
hist_dmr_chr <- ggplot(count_df_long, aes(x = chr, y = count, fill = status)) +
geom_bar(position = "dodge" , stat = "identity", colour = "black") +
labs(title = paste0("Number of differentially methylated ", LEVEL, " by chromosome"),
x = "Chromosome",
y = LEVEL) +
theme_bw(base_size = 14) +
scale_fill_manual("", values=c('Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
rm(delete_row)
rm(count_df, count_df_long)
rm(chr_others)
hist_dmr_chr
}
Saves the whole R session of the script (variables, objects …)
at the end of its execution. The RData file produced can then be opened in a local environment (if you have enough space)
to make modifications. For example to retouch figures.
save.image(file = PLOT_RDATA)
# R Session
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 11 (bullseye)
##
## Matrix products: default
## BLAS/LAPACK: /opt/conda/envs/wgbsflow/lib/libopenblasp-r0.3.21.so
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] stringr_1.5.0 methylKit_1.20.0 GenomicRanges_1.46.1
## [4] GenomeInfoDb_1.30.1 IRanges_2.28.0 S4Vectors_0.32.4
## [7] BiocGenerics_0.40.0 magrittr_2.0.3 dbplyr_2.3.0
## [10] ggplot2_3.3.6 yaml_2.3.5
##
## loaded via a namespace (and not attached):
## [1] MatrixGenerics_1.6.0 Biobase_2.54.0
## [3] tidyr_1.3.0 sass_0.4.5
## [5] jsonlite_1.8.4 splines_4.1.3
## [7] R.utils_2.12.2 gtools_3.9.4
## [9] bslib_0.4.2 assertthat_0.2.1
## [11] highr_0.10 GenomeInfoDbData_1.2.7
## [13] Rsamtools_2.10.0 numDeriv_2016.8-1.1
## [15] pillar_1.8.1 lattice_0.20-45
## [17] glue_1.6.2 limma_3.50.3
## [19] bbmle_1.0.25 digest_0.6.31
## [21] XVector_0.34.0 qvalue_2.26.0
## [23] colorspace_2.1-0 htmltools_0.5.4
## [25] Matrix_1.5-3 R.oo_1.25.0
## [27] plyr_1.8.8 XML_3.99-0.11
## [29] pkgconfig_2.0.3 emdbook_1.3.12
## [31] zlibbioc_1.40.0 purrr_1.0.1
## [33] mvtnorm_1.1-3 scales_1.2.1
## [35] BiocParallel_1.28.3 tibble_3.1.8
## [37] farver_2.1.1 generics_0.1.3
## [39] SummarizedExperiment_1.24.0 cachem_1.0.6
## [41] withr_2.5.0 cli_3.6.0
## [43] crayon_1.5.2 mclust_6.0.0
## [45] evaluate_0.20 R.methodsS3_1.8.2
## [47] fansi_1.0.4 MASS_7.3-58.2
## [49] tools_4.1.3 data.table_1.14.2
## [51] matrixStats_0.63.0 BiocIO_1.4.0
## [53] lifecycle_1.0.3 munsell_0.5.0
## [55] DelayedArray_0.20.0 Biostrings_2.62.0
## [57] compiler_4.1.3 jquerylib_0.1.4
## [59] fastseg_1.40.0 rlang_1.0.6
## [61] grid_4.1.3 RCurl_1.98-1.10
## [63] rjson_0.2.21 labeling_0.4.2
## [65] bitops_1.0-7 rmarkdown_2.20
## [67] restfulr_0.0.15 gtable_0.3.1
## [69] DBI_1.1.3 reshape2_1.4.4
## [71] R6_2.5.1 GenomicAlignments_1.30.0
## [73] knitr_1.42 dplyr_1.0.10
## [75] rtracklayer_1.54.0 bdsmatrix_1.3-6
## [77] fastmap_1.1.0 utf8_1.2.3
## [79] stringi_1.7.12 parallel_4.1.3
## [81] Rcpp_1.0.10 vctrs_0.5.2
## [83] tidyselect_1.2.0 xfun_0.37
## [85] coda_0.19-4